# Read in the monthly US energy consumption data
energy_file <- here("data", "energy.csv")
energy <- read_csv(energy_file)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## month = col_character(),
## res_total = col_double(),
## ind_total = col_double()
## )
# Parse the character month column to a yearmonth, then coerce to a
# tsibble with that yearmonth as the time index (single series, no key)
energy_ts <- energy %>%
  mutate(date = yearmonth(month)) %>%
  as_tsibble(index = date)
# Line plot of raw residential consumption over the full record
ggplot(energy_ts, aes(x = date, y = res_total)) +
  geom_line(color = "dodgerblue") +
  labs(y = "Residential energy consumption \n (Trillion BTU)")
# Season plot: one line per year, drawn across the months of the year
gg_season(energy_ts, y = res_total) +
  theme_minimal() +
  labs(x = "month",
       y = "residential energy consumption (trillion BTU)")
# Subseries plot: one panel per month, showing the series within each month
gg_subseries(energy_ts, res_total)
Our takeaway here is similar: there is clear seasonality (higher values in winter months), with an increasingly evident second peak in June/July/August. This reinforces our takeaways from the raw data and season plots.
STL is an acronym for “Seasonal and Trend decomposition using Loess”; Loess is a method for estimating nonlinear relationships.
# Fit an STL decomposition of the residential series
dcmp <- model(energy_ts, STL(res_total ~ season()))
# Inspect the extracted components (trend, seasonal, remainder)
components(dcmp)
## # A dable: 538 x 7 [1M]
## # Key: .model [1]
## # STL Decomposition: res_total = trend + season_year + remainder
## .model date res_total trend season_year remainder season_adjust
## <chr> <mth> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 STL(res_total ~… 1973 Jan 1932. 1257. 684. -9.21 1248.
## 2 STL(res_total ~… 1973 Feb 1687. 1253. 465. -31.3 1222.
## 3 STL(res_total ~… 1973 Mar 1497. 1250. 242. 5.67 1255.
## 4 STL(res_total ~… 1973 Apr 1178. 1246. -62.9 -5.49 1241.
## 5 STL(res_total ~… 1973 May 1015. 1242. -256. 28.7 1271.
## 6 STL(res_total ~… 1973 Jun 934. 1238. -313. 8.89 1247.
## 7 STL(res_total ~… 1973 Jul 981. 1234. -231. -22.0 1212.
## 8 STL(res_total ~… 1973 Aug 1019. 1231. -225. 13.7 1244.
## 9 STL(res_total ~… 1973 Sep 957. 1227. -314. 43.5 1271.
## 10 STL(res_total ~… 1973 Oct 992. 1224. -271. 39.4 1263.
## # … with 528 more rows
# Plot each decomposed component in its own panel
autoplot(components(dcmp)) +
  theme_minimal()
# Autocorrelation function of the residential series
autoplot(ACF(energy_ts, res_total))
# Fit an exponential smoothing (ETS) model with a multiplicative
# seasonal component ("M"):
energy_fit <- model(
  energy_ts,
  ets = ETS(res_total ~ season("M"))
)
# Project the fitted model 10 years past the end of the data:
energy_forecast <- forecast(energy_fit, h = "10 years")
# Plot only the forecast (ribbons show the 80% and 95% intervals):
autoplot(energy_forecast)
# Or overlay the forecast on the full historical series:
autoplot(energy_forecast, energy_ts)
# Append the model's fitted values (.fitted) and residuals (.resid)
# to the original observations, one row per time point
energy_predicted <- broom::augment(energy_fit)
# Use View(energy_predicted) to inspect the resulting data frame
Plotting the actual energy values (res_total), and the predicted values (stored as .fitted) atop them:
# Observed series in black, model fit layered on top in blue
ggplot(energy_predicted) +
  geom_line(aes(x = date, y = res_total)) +
  geom_line(aes(x = date, y = .fitted), color = "dodgerblue", alpha = 0.85)
Now let’s explore the residuals. Remember some important considerations: residuals should be uncorrelated, centered at 0, and ideally normally distributed. One way we can check the distribution is with a histogram:
# Histogram of model residuals to eyeball the distribution
ggplot(energy_predicted, aes(x = .resid)) +
  geom_histogram(bins = 20, fill = "dodgerblue", color = "white")
The residuals appear approximately normally distributed and centered at 0 (we could find summary statistics beyond this to explore further).
There are a number of other forecasting methods and models! You can learn more about ETS forecasting, seasonal naive (SNAIVE) and autoregressive integrated moving average (ARIMA) from Hyndman’s book - those are the models that I show below.
# Fit three candidate forecasting models side by side:
# exponential smoothing, ARIMA, and a seasonal naive baseline
energy_fit_multi <- model(
  energy_ts,
  ets = ETS(res_total ~ season("M")),
  arima = ARIMA(res_total),
  snaive = SNAIVE(res_total)
)
# Forecast each model 3 years beyond the end of the data
multi_forecast <- forecast(energy_fit_multi, h = "3 years")
# Compare the three forecasts on top of the historical data
autoplot(multi_forecast, energy_ts)
# Or view the forecasts alone (note the similarity across models):
autoplot(multi_forecast)
# Read the California county polygons from the TIGER 2016 shapefile
ca_counties <- read_sf(here("data", "ca_counties", "CA_Counties_TIGER2016.shp"))
# Keep (and rename in one step) the county name and land area columns;
# the sf geometry column is sticky and comes along automatically
ca_subset <- ca_counties %>%
  select(county_name = NAME, land_area = ALAND)
Use st_crs() to check the existing CRS for spatial data. We see that this CRS is WGS 84 / Pseudo-Mercator (EPSG:3857).
# Check the coordinate reference system of the county layer
st_crs(ca_subset)
## Coordinate Reference System:
## User input: WGS 84 / Pseudo-Mercator
## wkt:
## PROJCRS["WGS 84 / Pseudo-Mercator",
## BASEGEOGCRS["WGS 84",
## DATUM["World Geodetic System 1984",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4326]],
## CONVERSION["Popular Visualisation Pseudo-Mercator",
## METHOD["Popular Visualisation Pseudo Mercator",
## ID["EPSG",1024]],
## PARAMETER["Latitude of natural origin",0,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8801]],
## PARAMETER["Longitude of natural origin",0,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8802]],
## PARAMETER["False easting",0,
## LENGTHUNIT["metre",1],
## ID["EPSG",8806]],
## PARAMETER["False northing",0,
## LENGTHUNIT["metre",1],
## ID["EPSG",8807]]],
## CS[Cartesian,2],
## AXIS["easting (X)",east,
## ORDER[1],
## LENGTHUNIT["metre",1]],
## AXIS["northing (Y)",north,
## ORDER[2],
## LENGTHUNIT["metre",1]],
## USAGE[
## SCOPE["unknown"],
## AREA["World - 85°S to 85°N"],
## BBOX[-85.06,-180,85.06,180]],
## ID["EPSG",3857]]
Plot the California counties using geom_sf(). Notice that we can update aesthetics just like we would for a regular ggplot object. Here, we update the color based on land area (and change the color gradient).
# Map the counties, filling each polygon by its land area
ggplot(ca_subset) +
  geom_sf(aes(fill = land_area), color = "white", size = 0.1) +
  scale_fill_gradientn(colors = c("cyan", "blue", "purple")) +
  theme_void()
Red sesbania (Sesbania punicea) is an invasive plant (see more information from the California Invasive Plants Council). Observations for locations of invasive red sesbania are from CA DFW. See metadata and information here: https://map.dfg.ca.gov/metadata/ds0080.html
The data exist in data/red_sesbania, and the shapefile is stored as ds80.shp. Let’s read in the data:
# Read the red sesbania observation points
sesbania <- read_sf(here("data", "red_sesbania", "ds80.shp"))
# Inspect this layer's CRS:
st_crs(sesbania)
## Coordinate Reference System:
## User input: Custom
## wkt:
## PROJCRS["Custom",
## BASEGEOGCRS["NAD83",
## DATUM["North American Datum 1983",
## ELLIPSOID["GRS 1980",6378137,298.257222101,
## LENGTHUNIT["metre",1]],
## ID["EPSG",6269]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["Degree",0.0174532925199433]]],
## CONVERSION["unnamed",
## METHOD["Albers Equal Area",
## ID["EPSG",9822]],
## PARAMETER["Latitude of false origin",0,
## ANGLEUNIT["Degree",0.0174532925199433],
## ID["EPSG",8821]],
## PARAMETER["Longitude of false origin",-120,
## ANGLEUNIT["Degree",0.0174532925199433],
## ID["EPSG",8822]],
## PARAMETER["Latitude of 1st standard parallel",34,
## ANGLEUNIT["Degree",0.0174532925199433],
## ID["EPSG",8823]],
## PARAMETER["Latitude of 2nd standard parallel",40.5,
## ANGLEUNIT["Degree",0.0174532925199433],
## ID["EPSG",8824]],
## PARAMETER["Easting at false origin",0,
## LENGTHUNIT["metre",1],
## ID["EPSG",8826]],
## PARAMETER["Northing at false origin",-4000000,
## LENGTHUNIT["metre",1],
## ID["EPSG",8827]]],
## CS[Cartesian,2],
## AXIS["(E)",east,
## ORDER[1],
## LENGTHUNIT["metre",1,
## ID["EPSG",9001]]],
## AXIS["(N)",north,
## ORDER[2],
## LENGTHUNIT["metre",1,
## ID["EPSG",9001]]]]
Notice that this CRS is different from the California counties CRS, so we’ll want to update it to match. Use st_transform() to update the CRS:
# Reproject sesbania to EPSG:3857 so it matches the CA counties layer
sesbania <- st_transform(sesbania, 3857)
# Confirm the CRS was updated:
st_crs(sesbania)
## Coordinate Reference System:
## User input: EPSG:3857
## wkt:
## PROJCRS["WGS 84 / Pseudo-Mercator",
## BASEGEOGCRS["WGS 84",
## DATUM["World Geodetic System 1984",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4326]],
## CONVERSION["Popular Visualisation Pseudo-Mercator",
## METHOD["Popular Visualisation Pseudo Mercator",
## ID["EPSG",1024]],
## PARAMETER["Latitude of natural origin",0,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8801]],
## PARAMETER["Longitude of natural origin",0,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8802]],
## PARAMETER["False easting",0,
## LENGTHUNIT["metre",1],
## ID["EPSG",8806]],
## PARAMETER["False northing",0,
## LENGTHUNIT["metre",1],
## ID["EPSG",8807]]],
## CS[Cartesian,2],
## AXIS["easting (X)",east,
## ORDER[1],
## LENGTHUNIT["metre",1]],
## AXIS["northing (Y)",north,
## ORDER[2],
## LENGTHUNIT["metre",1]],
## USAGE[
## SCOPE["unknown"],
## AREA["World - 85°S to 85°N"],
## BBOX[-85.06,-180,85.06,180]],
## ID["EPSG",3857]]
Plotting the California counties and the sesbania records together
# Leave ggplot() empty so each geom_sf() layer can use its own data source
ggplot() +
  geom_sf(data = ca_subset) +
  geom_sf(data = sesbania, size = 1, color = "red")
# NOTE: this can take a little while to render
Let’s say we want to find the count of red sesbania observed locations in this dataset by county. How can I go about joining these data so that I can find counts? Don’t worry…st_join() has you covered for spatial joins!
# Spatially join sesbania points onto the county polygons they fall in
ca_sesbania <- st_join(ca_subset, sesbania)
And then we can find counts (note: these are not counts for individual plants, but by record in the dataset) by county:
# Count joined records per county (records, not individual plants)
sesbania_counts <- count(ca_sesbania, county_name)
Then we can plot a choropleth using the number of records for red sesbania as the fill color (instead of what we used previously, land area):
# Choropleth of record counts per county
ggplot(sesbania_counts) +
  geom_sf(aes(fill = n), color = "white", size = 0.1) +
  scale_fill_gradientn(colors = c("lightgray", "orange", "red")) +
  labs(fill = "Number of S. punicea records") +
  theme_minimal()
So we see that we can still use our usual wrangling skills! Let’s do a bit more for fun, just to prove that our existing wrangling skills still work with spatial data - the spatial information just sticks to it! Only plot the county with the greatest number of red sesbania records (Solano), and make a map of those locations (yeah there are many ways to do this):
# Sesbania point records that fall within Solano County
solano_sesbania <- filter(sesbania, COUNTY == "Solano")
# The Solano County polygon alone
solano <- filter(ca_subset, county_name == "Solano")
# Map the county outline with its sesbania points on top
ggplot() +
  geom_sf(data = solano) +
  geom_sf(data = solano_sesbania)
Sometimes we’ll want to make a map interactive so that audience members can zoom in, explore different areas, etc. We can use the {tmap} package to create an interactive map. Let’s make one for our California counties (fill aesthetic by land area) with the red sesbania locations on top:
# Switch tmap into interactive viewing mode
tmap_mode("view")
## tmap mode set to interactive viewing
# Interactive map: county polygons filled by land_area (BuGn palette),
# with a second shape layer adding the sesbania records as dots
tm_shape(ca_subset) +
  tm_fill(col = "land_area", palette = "BuGn") +
  tm_shape(sesbania) +
  tm_dots()